############################################################
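# Summary: This script predicts labels for the test sentences with the linear
#          SVM models trained at the best C for each dataset and seed.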

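# Output:
# - data/results/test_sentences_prediction_svm.csv (combined SVM predictions)
# - Figure 4: Confusion Matrices (not drawn in this section; presumably built
#   from the CSV above -- confirm)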

# 1) Best C per dataset for the linear SVM: keep only the svmLinear rows of
#    `final_best_df`, then reduce them to the unique (dataset, C) pairs.
bestC_tbl <- final_best_df |>
  filter(kernel == "svmLinear") |>
  distinct(dataset, C)

# Look up the best C stored in `bestC_tbl` for dataset `ds`.
# Returns that C value. Stops with an error unless exactly one row of
# `bestC_tbl` matches `ds`, so a missing or ambiguous choice is never used.
bestC_for <- function(ds) {
  matches <- pull(filter(bestC_tbl, dataset == ds), C)
  if (length(matches) == 1) {
    return(matches)
  }
  stop(sprintf("Best C not found or ambiguous for dataset '%s'", ds))
}

# 2) path builders (ADJUST these templates to your actual file layout if needed)
#    As configured below, this assumes files like:
#    models/models/svmLinear/<dataset>/seed_<SEED>/C_<C>/model.rds
#    models/features/svmLinear/<dataset>/seed_<SEED>/C_<C>/features.rds
# Root directories under which the per-dataset model and feature files live.
model_base   <- file.path("models", "models", "svmLinear")
feature_base <- file.path("models", "features", "svmLinear")

# Render C in fixed notation without padding, so directory names read
# "C_100000" rather than "C_1e+05".
fmtC <- function(C) {
  format(C, trim = TRUE, scientific = FALSE)
}

# Path of the saved model for dataset `ds`, training seed `seed` and cost `C`:
#   <model_base>/<ds>/seed_<seed>/C_<C>/model.rds
model_path_for <- function(ds, seed, C) {
  file.path(
    model_base,
    ds,
    sprintf("seed_%s", seed),
    sprintf("C_%s", fmtC(C)),
    "model.rds"
  )
}

# Path of the saved features for dataset `ds`, training seed `seed` and cost
# `C`:
#   <feature_base>/<ds>/seed_<seed>/C_<C>/features.rds
feature_path_for <- function(ds, seed, C) {
  file.path(
    feature_base,
    ds,
    sprintf("seed_%s", seed),
    sprintf("C_%s", fmtC(C)),
    "features.rds"
  )
}

# Build the model and feature paths of dataset `ds` at cost `C`, one per
# element of `seeds`.
# Returns a list with two character vectors, `model_paths` and
# `feature_paths`, each holding one path per seed in the order of `seeds`.
make_paths <- function(ds, seeds, C) {
  # Apply one of the path builders to every seed; vapply() enforces that each
  # call yields exactly one string.
  paths_from <- function(path_fn) {
    vapply(seeds, function(seed) path_fn(ds, seed, C), character(1))
  }

  list(
    model_paths   = paths_from(model_path_for),
    feature_paths = paths_from(feature_path_for)
  )
}

# 3) Look up the best C of each of the four datasets, then derive that
#    dataset's model and feature paths for every training seed in `seeds_more`.
C_strong_unbalanced   <- bestC_for(ds = "strong_unbalanced")
C_strong_balanced     <- bestC_for(ds = "strong_balanced")
C_combined_unbalanced <- bestC_for(ds = "combined_unbalanced")
C_combined_balanced   <- bestC_for(ds = "combined_balanced")

paths_strong_unbalanced <- make_paths(
  ds    = "strong_unbalanced",
  seeds = seeds_more,
  C     = C_strong_unbalanced
)
paths_strong_balanced <- make_paths(
  ds    = "strong_balanced",
  seeds = seeds_more,
  C     = C_strong_balanced
)
paths_combined_unbalanced <- make_paths(
  ds    = "combined_unbalanced",
  seeds = seeds_more,
  C     = C_combined_unbalanced
)
paths_combined_balanced <- make_paths(
  ds    = "combined_balanced",
  seeds = seeds_more,
  C     = C_combined_balanced
)

# Unpack each path set into the flat variable names used by the prediction
# calls further down: first all model paths, then all feature paths.
model_paths_strong_unbalanced   <- paths_strong_unbalanced[["model_paths"]]
model_paths_strong_balanced     <- paths_strong_balanced[["model_paths"]]
model_paths_combined_unbalanced <- paths_combined_unbalanced[["model_paths"]]
model_paths_combined_balanced   <- paths_combined_balanced[["model_paths"]]

feature_paths_strong_unbalanced   <- paths_strong_unbalanced[["feature_paths"]]
feature_paths_strong_balanced     <- paths_strong_balanced[["feature_paths"]]
feature_paths_combined_unbalanced <- paths_combined_unbalanced[["feature_paths"]]
feature_paths_combined_balanced   <- paths_combined_balanced[["feature_paths"]]

# (Optional) sanity check: show the first model path of every path set, one
# labelled line per dataset.
message(
  "Example paths:\n",
  paste0(
    c(
      " strong_unbalanced model: ",
      " strong_balanced   model: ",
      " combined_unbal    model: ",
      " combined_bal      model: "
    ),
    c(
      model_paths_strong_unbalanced[1],
      model_paths_strong_balanced[1],
      model_paths_combined_unbalanced[1],
      model_paths_combined_balanced[1]
    ),
    collapse = "\n"
  )
)

# ---- Predict the test sentences with the best-C models of each dataset ----

test_sentences <- readr::read_csv("data/test_sentences/test_sentences.csv")

# Run `predict_unlabeled_all_seeds()` on `test_sentences` for one dataset.
#
# model_paths / feature_paths: the per-seed path vectors of that dataset.
# Returns whatever `predict_unlabeled_all_seeds()` returns.
#
# The text/doc-id fields are identical for all four datasets, so they are set
# once here instead of being repeated in every call.
# NOTE(review): `docid_field` points at the text column itself, so document ids
# are only unique if no sentence text is repeated -- confirm this is intended.
predict_test_sentences <- function(model_paths, feature_paths) {
  predict_unlabeled_all_seeds(
    df_unlabeled  = test_sentences,
    model_paths   = model_paths,
    feature_paths = feature_paths,
    text_field    = "text",
    docid_field   = "text"
  )
}

# 1) Strong Unbalanced
df_unlabeled_strong_unbalanced <- predict_test_sentences(
  model_paths   = model_paths_strong_unbalanced,
  feature_paths = feature_paths_strong_unbalanced
)

# 2) Strong Balanced
df_unlabeled_strong_balanced <- predict_test_sentences(
  model_paths   = model_paths_strong_balanced,
  feature_paths = feature_paths_strong_balanced
)

# 3) Combined Unbalanced
df_unlabeled_combined_unbalanced <- predict_test_sentences(
  model_paths   = model_paths_combined_unbalanced,
  feature_paths = feature_paths_combined_unbalanced
)

# 4) Combined Balanced
df_unlabeled_combined_balanced <- predict_test_sentences(
  model_paths   = model_paths_combined_balanced,
  feature_paths = feature_paths_combined_balanced
)

# Stack the four prediction tables into one data frame. Each table is first
# tagged with constant `strategy`/`method` columns and the name of the dataset
# its models were trained on.
df_pred <- dplyr::bind_rows(
  dplyr::mutate(
    df_unlabeled_strong_balanced,
    strategy = "svm", method = "svm", dataset = "strong_balanced"
  ),
  dplyr::mutate(
    df_unlabeled_strong_unbalanced,
    strategy = "svm", method = "svm", dataset = "strong_unbalanced"
  ),
  dplyr::mutate(
    df_unlabeled_combined_unbalanced,
    strategy = "svm", method = "svm", dataset = "combined_unbalanced"
  ),
  dplyr::mutate(
    df_unlabeled_combined_balanced,
    strategy = "svm", method = "svm", dataset = "combined_balanced"
  )
)

# Sanity check: number of non-missing predictions per dataset and predicted
# label. The result is printed explicitly because top-level values are not
# auto-printed when the script is run through source(). `.groups = "drop"`
# returns an ungrouped table and avoids summarise()'s regrouping message.
df_pred |>
  group_by(dataset, pred_label_seed) |>
  summarise(count = sum(!is.na(pred_label_seed)), .groups = "drop") |>
  print()

# Persist the combined predictions. The output directory is created first so
# that a missing folder cannot abort the script after all predictions have
# already been computed; an existing directory is left untouched.
results_path <- "data/results/test_sentences_prediction_svm.csv"
dir.create(dirname(results_path), recursive = TRUE, showWarnings = FALSE)
readr::write_csv(df_pred, results_path)

